"Airbnb began in 2008 when two designers who had space to share hosted three travelers looking for a place to stay. Now, millions of hosts and travelers choose to create a free Airbnb account so they can list their space and book unique accommodations anywhere in the world. And Airbnb experience hosts share their passions and interests with both travelers and locals."
"Airbnb helps make sharing easy, enjoyable, and safe. We verify personal profiles and listings, maintain a smart messaging system so hosts and guests can communicate with certainty, and manage a trusted platform to collect and transfer payments."
The source of the data is kaggle.com. The data used in this analysis is for two cities: Boston and Seattle.
# Notebook display helpers (HTML/Markdown/Javascript renderers)
from IPython.core.display import display, HTML, Markdown, Javascript
# Inject a "Toggle Code" button for JupyterLab: the embedded JS flips the CSS
# display of every input-cell wrapper (except the first and last cells)
# between "flex" and "none" so the rendered report can hide its code.
# NOTE: the JS below is a runtime string and is left untouched.
HTML('''<script>
code_show="a"
var r = document.querySelector(':root')
if (code_show === "a") {
code_show="flex"
}
function code_toggle() {
var v = r.getElementsByClassName("jp-Cell-inputWrapper")
if (v.item(0).style.display === "") {
if (code_show === "none") {
for (i = 1; i < v.length-1; i++) {
v.item(i).style.display = "flex";
}
code_show = "flex"
} else {
for (i = 1; i < v.length-1; i++) {
v.item(i).style.display = "none";
}
code_show = "none"
}
}
}
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Toggle Code"></form>''')
# NOTE(review): duplicate of the import in the first cell — harmless.
from IPython.core.display import display, HTML, Markdown, Javascript
# paths
## 1. Code
myCodePath = "helper_files"
## 2. Data
myDataPath = "data"
import sys
# make the local helper modules importable
sys.path.append(myCodePath)
# project helpers: page-width formatting, HTML render/replace utilities, and
# pandas extensions (pandasExt presumably supplies myDescribe()/myCut() used
# later — TODO confirm against helper_files)
from formattingWidth import formatting_width, hideCode, myHTML, myHTMLreplace
from pandasExt import *
formatting_width()
# Legacy jQuery-based code-toggle for the classic Notebook UI; disabled
# (superseded by the JupyterLab-compatible toggle above).
if False:
    HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()" id="topdiv"><input type="submit" value="Click here: Toggle on/off code."></form>''')
import os
import glob
import numpy as np
import pandas as pd
# turn off scientific notation in pandas
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# show full column text
pd.set_option('display.max_colwidth', None)
# extend output
pd.set_option('display.max_rows', 100000)
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): seaborn is imported twice (as sns and sb); only one alias is
# used in this chunk — left in place in case later cells reference `sb`.
import seaborn as sb
from pprint import pprint
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# fasttext: language identification (lid.176.bin model) used to keep English reviews
import fasttext
%matplotlib inline
import warnings
# silence library warnings in the rendered report
warnings.filterwarnings('ignore')
# Calculate Sentiments?
## The lists of sentiments are saved as pickle files.
## To redo this (slow) analysis, set these flags to True.
sentiments = False
sentiments_eng = False
import calendar as cal
# Ordered weekday/month names, used later to build ordered Categoricals.
# list(...) is the idiomatic form of the original identity comprehensions;
# cal.month_name[0] is the empty string, so it is skipped.
day_order = list(cal.day_name)
month_order = list(cal.month_name)[1:]
# location of all data files
all_files = glob.glob(os.path.join(myDataPath, "*.csv"))
# Load every CSV into a dataframe keyed by its base filename without extension.
# Use os.path.basename/splitext instead of splitting on "/" so the keys are
# also correct on Windows paths; the glob pattern already guarantees .csv,
# so the original `if "csv" in f` filter is redundant.
dfs = {os.path.splitext(os.path.basename(f))[0]: pd.read_csv(f, low_memory=False)
       for f in all_files}
# tag each dataframe with its city (file names contain "boston" or are Seattle data)
for key in dfs.keys():
    dfs[key]['city'] = "Boston" if "boston" in key else "Seattle"
# concatenate the per-city files into one dataframe per data type
calendar = pd.concat([dfs[key] for key in dfs.keys() if 'calendar' in key]).reset_index(drop=True)
listings = pd.concat([dfs[key] for key in dfs.keys() if 'listings' in key]).reset_index(drop=True)
reviews = pd.concat([dfs[key] for key in dfs.keys() if 'reviews' in key]).reset_index(drop=True)
# report shapes, id counts, and prettified column names
myHTML(f"Rows and columns in: calendar {calendar.shape}, listings {listings.shape}, reviews {reviews.shape}")
myHTML(f"Unique listing ids in listings {len(listings.id.unique())}")
myHTML(f'''\nColumns in:<br>calendar: {', '.join([str(x).replace('_',' ').title() for x in calendar.columns])},
<br><br> listings: {', '.join([str(x).replace('_',' ').title() for x in listings.columns])},
<br><br> reviews: {', '.join([str(x).replace('_',' ').title() for x in reviews.columns])}''')
# Confirm `id` uniquely identifies rows in listings (value_counts all equal 1)
# NOTE(review): the message string is missing its closing ")" — cosmetic only.
myHTML(f"The listings id variable is a unique identifier (number of unique entries for each id: {listings.id.value_counts().unique()}")
listings.review_scores_rating.mean()
93.31600201918224
# myDescribe(): presumably a decile-percentile describe helper from pandasExt
# (see output below) — TODO confirm against helper_files
describe = listings.review_scores_rating.myDescribe()
describe
count 5943.00 mean 93.32 std 8.21 min 20.00 0% 20.00 10% 83.20 20% 90.00 30% 92.00 40% 94.00 50% 96.00 60% 97.00 70% 98.00 80% 100.00 90% 100.00 100% 100.00 max 100.00 Name: review_scores_rating, dtype: float64
# Commented-out prototype of myBins()/myCut(); the working versions are
# imported from pandasExt below — kept for reference.
# def myBins(df,var):
# describe = df[var].myDescribe()
# intervals = describe.index[~describe.index.isin(['count', 'mean', 'std', 'min','max'])]
# bins = [describe.loc[intervals[ind]] for ind in range(len(intervals)) if describe.loc[intervals[ind]] != describe.loc[intervals[ind-1]]]
# bins = [round(x,2) for x in bins]
# labels = [intervals[ind] for ind in range(len(intervals)) if describe.loc[intervals[ind]] != describe.loc[intervals[ind-1]]]
# labels[-1] = "100%"
# labels = [f"{str(bins[ind])} ({labels[ind].replace('%','th')})" for ind in range(len(labels))]
# return (bins,labels)
# bins, labels = myBins(listings,'review_scores_rating')
# print(bins)
# print (labels)
# def myCut(df,var):
# bins, labels = myBins(df,var)
# var_name = f"{var.replace('_',' ')} Percentiles".title()
# df[var_name] = pd.cut(df[var],
# bins,
# labels=labels[1:],
# include_lowest=True)
# return var_name
# re-import to pick up myCut; it bins ratings into percentile bands and
# returns the name of the new column it adds to listings
from pandasExt import *
new_name = myCut(listings,'review_scores_rating')
[20.0, 83.2, 90.0, 92.0, 94.0, 96.0, 97.0, 98.0, 100.0] ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '100%']
# Convert the listings price column from a formatted string (e.g. "$1,250.00")
# to float. BUG FIX: the original regex extract r'(\d+\.\d+)' mangles prices
# with a thousands separator ("$1,250.00" -> 250.00) and drops prices without
# decimals; stripping "$" and "," literally handles both.
if isinstance(listings["price"][0], str):
    listings["price"] = (listings["price"]
                         .str.replace("$", "", regex=False)
                         .str.replace(",", "", regex=False)
                         .astype(float))
listings[[new_name,'review_scores_rating']]
# Grouped bar chart: mean price per city for each rating-percentile band;
# `Waiting` is the matplotlib Axes returned by the plot call.
Waiting = listings.groupby(['city',new_name])['price'].mean().unstack().plot.bar(rot=0,
                                                                                figsize=[18.70, 8.27],
                                                                                cmap="RdYlGn",
                                                                                width=0.9)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5),
           title="Ratings Percentiles", title_fontsize = 12);
# Annotate each bar with its dollar value, centered near the bar top.
for i in Waiting.patches:
    # print(i)
    # Waiting.text(i.get_x(),i.get_height()+1000,str(i.get_height().astype(int)),rotation=0,fontsize=12,weight = "bold", color='white')
    Waiting.text(i.get_xy()[0] + i.get_width()*0.5, # X position
                 i.get_xy()[1] + i.get_height()*0.95,# Y position
                 f"${str(round(i.get_height(),2))}",
                 ha="center",
                 va="center",# Text
                 rotation=0,
                 fontsize=10,
                 weight = "bold",
                 color='white');
# Mean overall rating by city (output below: Seattle slightly higher)
listings.groupby('city').review_scores_rating.mean()
city Boston 91.92 Seattle 94.54 Name: review_scores_rating, dtype: float64
# Mean accuracy sub-score by city
listings.groupby('city').review_scores_accuracy.mean()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7403 entries, 0 to 7402 Data columns (total 95 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7403 non-null int64 1 listing_url 7403 non-null object 2 scrape_id 7403 non-null int64 3 last_scraped 7403 non-null object 4 name 7403 non-null object 5 summary 7083 non-null object 6 space 5777 non-null object 7 description 7403 non-null object 8 experiences_offered 7403 non-null object 9 neighborhood_overview 4956 non-null object 10 notes 3822 non-null object 11 transit 5179 non-null object 12 access 2096 non-null object 13 interaction 2031 non-null object 14 house_rules 2393 non-null object 15 thumbnail_url 6484 non-null object 16 medium_url 6484 non-null object 17 picture_url 7403 non-null object 18 xl_picture_url 6484 non-null object 19 host_id 7403 non-null int64 20 host_url 7403 non-null object 21 host_name 7401 non-null object 22 host_since 7401 non-null object 23 host_location 7384 non-null object 24 host_about 5235 non-null object 25 host_response_time 6409 non-null object 26 host_response_rate 6409 non-null object 27 host_acceptance_rate 6159 non-null object 28 host_is_superhost 7401 non-null object 29 host_thumbnail_url 7401 non-null object 30 host_picture_url 7401 non-null object 31 host_neighbourhood 6764 non-null object 32 host_listings_count 7401 non-null float64 33 host_total_listings_count 7401 non-null float64 34 host_verifications 7403 non-null object 35 host_has_profile_pic 7401 non-null object 36 host_identity_verified 7401 non-null object 37 street 7403 non-null object 38 neighbourhood 6444 non-null object 39 neighbourhood_cleansed 7403 non-null object 40 neighbourhood_group_cleansed 3818 non-null object 41 city 7403 non-null object 42 state 7403 non-null object 43 zipcode 7358 non-null object 44 market 7389 non-null object 45 smart_location 7403 non-null object 46 country_code 7403 non-null object 47 country 7403 non-null object 48 latitude 7403 non-null float64 49 
longitude 7403 non-null float64 50 is_location_exact 7403 non-null object 51 property_type 7399 non-null object 52 room_type 7403 non-null object 53 accommodates 7403 non-null int64 54 bathrooms 7373 non-null float64 55 bedrooms 7387 non-null float64 56 beds 7393 non-null float64 57 bed_type 7403 non-null object 58 amenities 7403 non-null object 59 square_feet 153 non-null float64 60 price 7403 non-null object 61 weekly_price 2901 non-null object 62 monthly_price 2405 non-null object 63 security_deposit 3208 non-null object 64 cleaning_fee 5266 non-null object 65 guests_included 7403 non-null int64 66 extra_people 7403 non-null object 67 minimum_nights 7403 non-null int64 68 maximum_nights 7403 non-null int64 69 calendar_updated 7403 non-null object 70 has_availability 3818 non-null object 71 availability_30 7403 non-null int64 72 availability_60 7403 non-null int64 73 availability_90 7403 non-null int64 74 availability_365 7403 non-null int64 75 calendar_last_scraped 7403 non-null object 76 number_of_reviews 7403 non-null int64 77 first_review 6020 non-null object 78 last_review 6020 non-null object 79 review_scores_rating 5943 non-null float64 80 review_scores_accuracy 5922 non-null float64 81 review_scores_cleanliness 5932 non-null float64 82 review_scores_checkin 5925 non-null float64 83 review_scores_communication 5934 non-null float64 84 review_scores_location 5926 non-null float64 85 review_scores_value 5926 non-null float64 86 requires_license 7403 non-null object 87 license 0 non-null float64 88 jurisdiction_names 3818 non-null object 89 instant_bookable 7403 non-null object 90 cancellation_policy 7403 non-null object 91 require_guest_profile_picture 7403 non-null object 92 require_guest_phone_verification 7403 non-null object 93 calculated_host_listings_count 7403 non-null int64 94 reviews_per_month 6020 non-null float64 dtypes: float64(17), int64(13), object(65) memory usage: 5.4+ MB
# Spot-check five random calendar rows
calendar.sample(5)
| listing_id | date | available | price | city | |
|---|---|---|---|---|---|
| 2526142 | 2686624 | 2016-12-11 | f | NaN | Seattle |
| 307063 | 14428745 | 2017-05-30 | f | NaN | Boston |
| 169426 | 14854164 | 2017-04-01 | t | $90.00 | Boston |
| 391336 | 8200839 | 2017-05-15 | t | $110.00 | Boston |
| 540758 | 2723274 | 2016-12-29 | f | NaN | Boston |
# Spot-check five random reviews
reviews.sample(5)
| listing_id | id | date | reviewer_id | reviewer_name | comments | city | |
|---|---|---|---|---|---|---|---|
| 27110 | 6691873 | 43592683 | 2015-08-20 | 1809464 | Rudy | Accurate listing! I had a great experience. Becca makes you feel very welcome. There was coffee and creamer as promised. Definitely recommend! | Seattle |
| 128794 | 2931165 | 21955393 | 2014-10-27 | 11826811 | Catherine | We really appreciated our stay with Cynthia ans her lovely family. They were very kind and generous. They made our first trip to Boston very enjoyable. It was a charming place to stay, not too far from the center or the city. \r\n\r\nNous avons passé un très beau séjour chez Cynthia. Elle et sa famille nous ont accueillis très chaleureusement. C'est une charmante maison à distance raisonnable de la ville pour quiconque souhaitant visiter Boston et les environs. Nous y retournerions sans hésiter! | Boston |
| 10573 | 158345 | 3585370 | 2013-02-19 | 837448 | Lynne | Thoroughly lovely long weekend w/my girls at Marcia's. We all enjoyed, again, Marcia's lovely home in this Ballard neighborhood. It was very relaxing, cozy, homey. We enjoyed visiting the Ballard farmer's market, nearby Greenlake, and even south Whidbey Island. Marcia's place has become a great "2nd home" for us when visiting our daughter at UW. Marcia is a considerate and thoughtful host and I look forward to our next stay. Highly recommend!! | Seattle |
| 128669 | 3392423 | 38491249 | 2015-07-16 | 31716342 | Elodie | Tuomas est un hôte très sympathique et parfaitement organisé. \r\nLe logement est très propre, confortable et très beau. Le quartier est le plus joli de Boston! Nous sommes ravies de cette escale à Boston.\r\nLa chambre est un peu petite pour deux mais pour 2 nuits c'est suffisant. | Boston |
| 74966 | 580815 | 40933472 | 2015-08-03 | 21683316 | Emilia | This apartment was perfect for our stay in Seattle - spacious, clean, comfortable, and so well located, within easy walking distance both to downtown and all the great things to do/eat/see in Capitol Hill. The unit was definitely as pictured, charmingly decorated with fun art work. We referred often to the binder on the dining room table containing lots of neighborhood recommendations. Christine, who is awesome, met up with us to give us the keys and was very easy to contact. Would totally stay here again! | Seattle |
import os
import pickle
class SentimentsWordCloud(object):
    '''Build positive/negative word clouds from a dataframe text column.

    extractEnglish() = extract only english entries
    extractStrings() = transform the column of strings to a single string - calls extractEnglish if required
    extractSentiments() = extracts words with positive, neutral or negative sentiments
    wordCloudViz() = create a word cloud visualization from sentiments (positive and negative only)
    '''
    def __init__(self,df,dfCol,groupby=False,english=False,englishPercent=50,stopwords = [],update=False,proportional=False):
        '''
        Required inputs: df = relevant dataframe
                         dfCol = A column (of strings) of the dataframe
        Optional inputs: groupby = a category to groupby
                         english = whether or not to extract english strings only
                         englishPercent = if english strings, probability of match
                         stopwords = a list of words to omit from the word cloud
                         update = recompute sentiments even when a pickle cache exists
                         proportional = size the positive/negative clouds by their word-count ratio
        '''
        # NOTE(review): `stopwords=[]` is a mutable default argument; safe here
        # only because this class never mutates it.
        self.df = df
        self.dfCol = dfCol
        self.groupby = groupby
        self.english = english
        self.englishPercent = englishPercent
        self.stopwords = stopwords
        # set True once cached sentiments are loaded from disk
        self.pickleFile = False
        self.update = update
        self.proportional=proportional
        # get current working directory file list
        self.file_list = os.listdir(os.getcwd())
        # if the file (using the standard naming convention) is in cwd
        # load it, and use it instead of recreating the sentiments dictionaries
        # unless update is specified
        if f"sentiments_{self.groupby}_{self.english}.pickle" in self.file_list and not self.update:
            self.pickleFile = True
            with open(f"sentiments_{self.groupby}_{self.english}.pickle", 'rb') as handle:
                self.sentiments = pickle.load(handle)
        else:
            # only load the fasttext model if no pickle file is available
            # AND english only text is required.
            if self.english:
                # initialize fasttext model
                self.ft = fasttext.load_model('lid.176.bin')
    def extractEnglish(self,text):
        '''
        Use fasttext to predict whether `text` is English
        with probability at least self.englishPercent
        - returns a boolean (used row-wise via apply)
        '''
        # check for NaN (non-string entries are treated as non-English)
        if not isinstance(text,str):
            return False
        """Return a boolean if text is predicted (over perc% to be english)"""
        # k=1: keep only the single most likely language label
        result = self.ft.predict(text.replace("\n"," ").lower(),k=1)
        if result[1][0] > self.englishPercent/100 and result[0][0] == '__label__en':
            return True
        else:
            return False
    def extractEnglishStrings(self):
        '''
        Filter self.df down to rows predicted to be English;
        returns (and caches to pickle) the boolean row mask.
        '''
        # if the file (using the standard naming convention) is in cwd
        # load it, and use it instead of recreating the sentiments dictionaries
        # unless update is specified
        if "sentiments_is_English_rows.pickle" in self.file_list and not self.update:
            with open("sentiments_is_English_rows.pickle", 'rb') as handle:
                is_eng = pickle.load(handle)
        else:
            # apply extractEnglish to each row (slow: one model call per row)
            is_eng = self.df[self.dfCol].apply(self.extractEnglish)
        # filter the dataframe (side effect: self.df keeps only English rows)
        self.df = self.df.loc[is_eng]
        # this part takes a while, so I am going to pickle the boolean series
        # of whether a row is predicted to be in english
        with open(f"sentiments_is_English_rows.pickle", 'wb') as handle:
            pickle.dump(is_eng, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return is_eng
    def extractStrings(self):
        '''
        extract string(s) from the dataframe column
        One string if not grouping
        One string per group if grouping
        '''
        # if English only statements are required, filter the dataframe
        if self.english:
            self.extractEnglishStrings()
        if self.groupby:
            # one concatenated string per group
            self.commentsString = self.df.dropna(subset=[self.dfCol]).groupby(self.groupby)[self.dfCol].apply(" ".join)
        else:
            # a single concatenated string, indexed "All"
            self.commentsString = pd.Series(" ".join(self.df.dropna(subset=[self.dfCol])[self.dfCol]))
            self.commentsString.index = ["All"]
        return self.commentsString
    def extractSentiments(self):
        '''Classify every word as Positive/Neutral/Negative with VADER;
        returns {group_key: {"Positive": [...], "Neutral": [...], "Negative": [...]}}
        and caches the result to pickle.'''
        # reuse the cached sentiments loaded in __init__, if any
        if self.pickleFile:
            return self.sentiments
        # extract the strings required
        self.extractStrings()
        # import required modules
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        # create a dictionary to capture the sentiments for each group
        self.sentiments ={}
        for indx in range(self.commentsString.shape[0]):
            commentString = self.commentsString.iloc[indx]
            # create a list from the string
            words_list=commentString.split()
            # create an instance of the analyzer
            sid = SentimentIntensityAnalyzer()
            # lists to store the sentiments
            pos_word_list=[]
            neu_word_list=[]
            neg_word_list=[]
            # dictionary container for the lists
            sentiments = {}
            # loop through the list of words: compound score >= 0.5 is
            # positive, <= -0.5 negative, everything else neutral
            for word in words_list:
                if (sid.polarity_scores(word)['compound']) >= 0.5:
                    pos_word_list.append(word)
                elif (sid.polarity_scores(word)['compound']) <= -0.5:
                    neg_word_list.append(word)
                else:
                    neu_word_list.append(word)
            sentiments['Positive'] = pos_word_list
            sentiments['Neutral'] = neu_word_list
            sentiments['Negative'] = neg_word_list
            self.sentiments[self.commentsString.index[indx]] = sentiments
        # this part takes a while, so I am going to pickle the results of each run
        # then use those files to avoid having to run
        pickle_filename = f"sentiments_{self.groupby}_{self.english}.pickle"
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(self.sentiments, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return self.sentiments
    def wordCloudViz(self):
        '''Generator: each next() renders the positive/negative word clouds
        for one group key (groups with an empty sentiment list are skipped).
        NOTE(review): adds self.stopwords to the module-global STOPWORDS set.
        '''
        # generate sentiments
        sentiments_dict = self.extractSentiments()
        for item in self.stopwords:
            STOPWORDS.add(item)
        for key in sentiments_dict:
            sentiments_all = sentiments_dict[key]
            # ensure that there are words for the word cloud
            # if not, move on to the next key
            if 0 in [len(sentiments_all[k]) for k in sentiments_all.keys()]:
                continue
            # If the area of plots is set to be proportional (using a layout N x N, with plots (N-1)*(N-1) and 1*1
            # for example if the ratio is 100:1, N**2 -2N -99
            # then an adjustment to the width of the wordCloud has to be made (i.e. the number of columns the plot
            # occupies)
            c = int(len(sentiments_all["Positive"])/len(sentiments_all["Negative"])) -1
            import math
            # number of columns that generates proportional subplots
            # (quadratic-formula solution of N**2 - 2N - 2c = 0)
            ncols = int(2+math.sqrt((1**2) - (4*(-2*c)))/(2*1))
            if self.proportional:
                wcWidth = int(1050*((ncols-1)/ncols))
            else:
                wcWidth=1050
            # one WordCloud per sentiment, with its own background/colormap
            wordCloud = {}
            for label, value in {'Positive':['salmon','Pastel1'],"Negative":['grey',"Dark2"]}.items():
                wordCloud[label] = WordCloud(width = 1600, height = wcWidth,
                                             random_state=1,
                                             background_color=value[0], colormap=value[1],
                                             collocations=False, stopwords = STOPWORDS).generate(" ".join(sentiments_all[label]))
            if not self.proportional:
                # Set figure size: two equal side-by-side clouds
                fig, ax = plt.subplots(1,2, figsize=(14.69, 8.27))
                for ind, label in enumerate(wordCloud):
                    # Display image
                    ax[ind].imshow(wordCloud[label])
                    # No axis details
                    ax[ind].axis("off");
                    ax[ind].set_title(f"{label} sentiments for {key} reviews".title(), fontsize=14, weight="bold")
                plt.tight_layout()
                plt.show()
            else:
                # set area of plots to be proportional (using a layout N x N, with plots (N-1)*(N-1) and 1*1
                # for example if the ratio is 100:1, N**2 -2N -99
                c = int(len(sentiments_all["Positive"])/len(sentiments_all["Negative"])) -1
                import math
                ncols = int(2+math.sqrt((1**2) - (4*(-2*c)))/(2*1))
                nrows=ncols
                fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=[14.7,8.27])
                # get gridspec, upper left plot
                gs = axs[0, 0].get_gridspec()
                # remove all axis
                for ax in axs.flatten():
                    ax.axis("off")
                # remove the underlying axes
                for ax in axs[0:ncols-1].flatten():
                    ax.remove()
                # larger plot (positive cloud spans most of the grid)
                axbig = fig.add_subplot(gs[0:, 0:ncols-1])
                # smaller plot (negative cloud in the bottom-right cell)
                axsmall=axs[nrows-1][ncols-1]
                axbig.axis("off")
                fig.tight_layout()
                for ind, label in enumerate(wordCloud):
                    if label == "Positive":
                        axbig.imshow(wordCloud[label])
                    else:
                        axsmall.imshow(wordCloud[label])
                fig.suptitle(f"Sentiments for {key} reviews\n(ratio of {c+1}:1)".title(), fontsize=14, weight="bold", y=1.05)
                fig.show()
            yield
# Hard-coded timing from the pickle-creation run (flag-gated below)
myHTML(f"To create all of the required pickle files took {round(1835.3076219558716,2)} seconds, or {round(1835.3076219558716/60,2)} minutes")
import math
# sanity check of the proportional-layout column formula for a 100:1 ratio
c = 99
int(2+math.sqrt((1**2) - (4*(-2*c)))/(2*1))
16
# Rebuild every sentiment pickle (each groupby x english-only combination).
# Slow (~30 min total per the timing above) — gated behind a flag.
recreate_pickle_files = False
if recreate_pickle_files:
    # start timer
    t0=time()
    options = [False, 'city', 'listing_id']
    for option in options:
        for value in [True, False]:
            swc = SentimentsWordCloud(reviews,'comments',groupby=option,english=value,update=True)
            swc.extractSentiments()
    print(f'Time Taken: {time()-t0}')
# word clouds for all reviews (no grouping, all languages);
# wordCloudViz() is a generator — each next() renders one group's clouds
swc = SentimentsWordCloud(reviews,'comments')
vizGen = swc.wordCloudViz()
next(vizGen)
# Load the cached English-row mask and split comments by predicted language
file_list = os.listdir(os.getcwd())
if "sentiments_is_English_rows.pickle" in file_list:
    with open("sentiments_is_English_rows.pickle", 'rb') as handle:
        is_eng = pickle.load(handle)
eng_comments = reviews.loc[is_eng,'comments']
nonEng_comments = reviews.loc[~is_eng,'comments']
# Example: first non-English comment containing "die" (presumably the
# German/Dutch article rather than the English verb — see output)
nonEng_comments.dropna().loc[nonEng_comments.dropna().str.lower().str.contains('die')].iloc[0]
for line in nonEng_comments.dropna().loc[nonEng_comments.dropna().str.lower().str.contains(' die')]:
    myHTMLreplace(line, word=' die')
    break
# Word clouds for English-only reviews
swc = SentimentsWordCloud(reviews,'comments',english=True)
vizGen = swc.wordCloudViz()
next(vizGen)
# Show English comments containing " die" / " rob" in context (checking why
# these words appear in the negative cloud)
for line in eng_comments.dropna().loc[eng_comments.dropna().str.lower().str.contains(' die')]:
    myHTMLreplace(line, word=' die')
    break
for line in eng_comments.dropna().loc[eng_comments.dropna().str.lower().str.contains(' rob')]:
    myHTMLreplace(line, word=' Rob')
    break
# Proportional layout: cloud areas sized by the positive:negative word ratio
swc = SentimentsWordCloud(reviews,'comments',english=True,proportional=True)
vizGen = swc.wordCloudViz()
next(vizGen)
# aspect ratio of the 14.7 x 8.27 figures used throughout
aspect = 14.7/8.27
aspect
1.777509068923821
# sentiments for the English-only, ungrouped run (served from the pickle
# cache when present)
sent = swc.extractSentiments()
def proportionalPlot(ncols, nrows):
    '''Layout demo: an ncols x nrows grid where one large axes spans all rows
    and the first ncols-1 columns (bar chart) and the bottom-right 1x1 cell
    holds a small KDE plot. Used to prototype the proportional word-cloud
    layout in SentimentsWordCloud.wordCloudViz().
    '''
    demo = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=[14.7, 8.27])
    # gridspec of the grid, taken from the upper-left axes
    gs = axs[0, 0].get_gridspec()
    # blank every cell, then drop the ones the big axes will cover
    for cell in axs.flatten():
        cell.axis("off")
    for cell in axs[0:ncols - 1].flatten():
        cell.remove()
    # the large spanning axes
    axbig = fig.add_subplot(gs[0:, 0:ncols - 1])
    demo.plot.bar(ax=axbig)
    axbig.axis("off")
    axbig.annotate('Big Axes \nGridSpec[1:, -1]', (0.1, 0.5),
                   xycoords='axes fraction', va='center')
    # the small bottom-right plot
    demo.plot.kde(ax=axs[nrows - 1][ncols - 1]);
    fig.tight_layout()
# Boolean mask of English rows (cached), then an English-only search corpus
english_rows = swc.extractEnglishStrings()
English = reviews[english_rows].dropna(subset=["comments"])
# parameters for the word-in-context search below
searchDF = English
searchCol= 'comments'
searchWord = 'dead'
def highlightWord(searchDF,searchCol,searchWord,include):
    '''
    Display the sentences containing `searchWord` from a text column.

    searchDF   = dataframe to search
    searchCol  = name of the text column to search and display
    searchWord = word to look for (matched with a trailing space)
    include    = maximum number of matching rows to display
    '''
    i=0
    # BUG FIX: the original iterated the hard-coded `.comments` column even
    # though the column name is a parameter; use searchCol throughout so the
    # function works for any text column.
    matches = searchDF[searchDF[searchCol].str.lower().str.contains(f"{searchWord} ",regex=True)]
    for row in matches[searchCol]:
        myHTML(f"\n{searchCol} number {i}".title())
        # keep only the sentences that actually contain the word
        sentences = [f"{sentence}." for sentence in row.split('.') if f'{searchWord} ' in sentence]
        for statement in sentences:
            myHTMLreplace(statement.lower(),word=f'{searchWord} ')
        i+=1
        if i == include:
            break
# Show up to 10 English comments containing "dead "
highlightWord(searchDF,searchCol,searchWord,10)
# highlightWord(nonEnglish,searchCol,'die')
Grouped by city, english language only: Time Taken: 303.38720989227295 seconds
loaded from pickle: Time Taken: 15.748566150665283 seconds
# display(myHTML(f"For all reviews: The number of negative words {len(sentiments_all['Negative'])}; The number of positive words {len(sentiments_all['Positive'])}"))
def sentimentCount(allComments, cityComments):
    '''
    Summarize sentiment word counts: totals, positive proportion, and the
    top-10 negative words — for the whole corpus and for each city.

    allComments  = dict with 'Positive'/'Negative' (and 'Neutral') word lists
    cityComments = dict mapping a city name to such a dict

    Generalized from the original hard-coded Boston/Seattle version: any set
    of city keys now works, and the summary logic lives in one helper.
    '''
    def _report(label, words):
        # one HTML summary line for a single group of sentiment lists
        n_pos = len(words['Positive'])
        n_neg = len(words['Negative'])
        top_neg = pd.Series(words['Negative']).value_counts().index[:10]
        display(myHTML(f'''For {label} reviews: The number of negative words {n_neg}; The number of positive words {n_pos};
For a proportion of positive of {round(n_pos/(n_pos+n_neg),2)}.<br>The top 10 negative words are: {", ".join(top_neg)}'''.title()))
    _report("all", allComments)
    for city in cityComments:
        _report(city, cityComments[city])
# sentimentCount(sentiments_all, city_sentiment)
# sentimentCount(sentiments_all_eng, city_sentiment_eng)
# Section heading. BUG FIX: the closing tag was malformed ("</h2)") which
# breaks the rendered HTML; corrected to "</h2>".
display(myHTML("<h2>Decision About Merging</h2>"))
None
# If dataframes are to be merged:
## currently, no — kept for reference, gated behind `if False`
if False:
    # merge listing with calendar
    df = listings.merge(calendar, left_on='id', right_on='listing_id')
    # merge result with reviews
    df = df.merge(reviews, left_on='id', right_on='listing_id')
# Function: create date variables
def dateVariables(df, dateVar):
    '''Parse df[dateVar] to datetime in place and add ordered year, month
    (full name) and day (weekday name) columns, using the module-level
    month_order / day_order lists as the Categorical orderings.'''
    parsed = pd.to_datetime(df[dateVar])
    df[dateVar] = parsed
    df['year'] = parsed.dt.year
    # full month name, ordered January..December
    df['month'] = pd.Categorical(parsed.dt.strftime("%B"),
                                 categories=month_order,
                                 ordered=True)
    # weekday name, ordered Monday..Sunday
    df['day'] = pd.Categorical(parsed.dt.strftime("%A"),
                               categories=day_order,
                               ordered=True)
# create date variables on all three dataframes
dateVariables(calendar,'date')
dateVariables(reviews,'date')
dateVariables(listings,'last_scraped')
# re-encode available ("t"/"f") to boolean
calendar['available'] = calendar['available'].replace({"t":True,"f":False})
# re-encode price (e.g. "$1,200.00") to float. BUG FIX: pass regex=False so
# "$" is stripped literally — under regex matching "$" is an end anchor and
# removes nothing (and the regex default changed in pandas 1.4); also strip
# thousands commas, consistent with the listings price cleanup.
calendar["price"] = (calendar.price
                     .str.replace("$", "", regex=False)
                     .str.replace(",", "", regex=False)
                     .astype(float))
# keep only the most recent calendar/review row per listing
recent_calendar = calendar.loc[calendar.groupby('listing_id').date.idxmax()].reset_index(drop=True)
recent_reviews = reviews.loc[reviews.groupby('listing_id').date.idxmax()].reset_index(drop=True)
# Cross-check ids between the three dataframes. PERF FIX: `x in some_array`
# rescans the whole ndarray for every element (O(n*m)); build each id set
# once and use O(1) membership instead.
listing_ids = set(listings.id.values)
calendar_ids = set(recent_calendar.listing_id.values)
review_ids = set(recent_reviews.listing_id.values)
# recent-calendar ids absent from listings (original output: [])
[x for x in recent_calendar.listing_id.values if x not in listing_ids]
# listing ids absent from the recent calendar (original output: [])
[x for x in listings.id.values if x not in calendar_ids]
# recent-review ids absent from listings (original output: [])
[x for x in recent_reviews.listing_id.values if x not in listing_ids]
# listing ids with no review at all (order of listings.id preserved)
missing_reviews = [x for x in listings.id.values if x not in review_ids]
display(HTML(f'''There are {len(missing_reviews)} listings that are not included in the reviews dataframe. That is {round(100*len(missing_reviews)/len(listings),2)}%.
This matches the missing reviews from the listings dataframe (i.e. where Number of Listings is zero): {len(listings.loc[listings.number_of_reviews == 0])}'''))
All listings, in the listings data, are in the calendar data file. But close to 19% are missing from the reviews data.
# All review-related columns in listings
[x for x in listings.columns if 'review' in x.lower()]
['number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month']
# listings that have never been reviewed
len(listings.loc[listings.number_of_reviews == 0])
1383
def groupbyPerc(df,var1,var2):
    '''
    groupbyPerc: group dataframe df by var1 and var2,
    unstack var2, convert the counts to row percentages
    (per category of var1), and draw a horizontal bar chart.
    '''
    counts = df.groupby([var1, var2]).size().unstack(var2)
    # each row sums to 100%
    percentages = counts.apply(lambda row: row * 100 / row.sum(), axis=1)
    percentages.plot.barh(width=0.9, figsize=[14.70, 8.27])
    plt.title(f"Percentage {var2} by {var1}".title(), fontsize=14, weight="bold")
    plt.ylabel(f"{var1}".title())
    plt.xlabel("Percentage".title())
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5),
               title=f"{var2}".title(), title_fontsize = 12);
def groupbyAvg(df, var1, var2, label='', legend=False):
    '''
    groupbyAvg: group dataframe df by var1,
    get the average of var2 for each category in var1,
    and generate a horizontal bar plot.
    (Docstring previously misnamed this function groupbyPerc.)

    inputs - df     - dataframe to plot
             var1   - column to group by (y-axis categories)
             var2   - numeric column to average
             label  - extra text (e.g. units such as "($)") appended to the x-axis label
             legend - if True, draw a legend outside the plot area
    '''
    (df.groupby([var1])[var2]
       .mean()
       .plot.barh(width=0.9,
                  figsize=[14.70, 8.27]))
    # replace underscores so column names read as words in the labels
    plt.title(f"Average {var2.replace('_',' ')} by {var1.replace('_',' ')}".title(), fontsize=14, weight="bold")
    plt.ylabel(f"{var1}".title())
    plt.xlabel(f"Average {var2.replace('_',' ')} {label}".title())
    if legend:
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5),
                   title=f"{var2}".title(), title_fontsize=12);
groupbyAvg(listings,"city","price",label="($)")
listings.price.myDescribe()
count 7403.00 mean 147.80 std 102.89 min 0.00 0% 0.00 10% 55.00 20% 70.00 30% 85.00 40% 100.00 50% 120.00 60% 145.00 70% 170.00 80% 200.00 90% 275.00 100% 999.00 max 999.00 Name: price, dtype: float64
groupbyPerc(calendar,"day","available")
groupbyPerc(calendar,"month","available")
groupbyPerc(calendar,"year","available")
city_merge
Boston 127.976
Seattle 173.926
Name: price, dtype: float64
calendar.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| listing_id | 2702460.00 | 6950804.94 | 4052439.85 | 3335.00 | 3865312.75 | 7035369.00 | 9504211.25 | 14933461.00 |
| price | 1577579.00 | 162.60 | 143.70 | 10.00 | 79.00 | 120.00 | 200.00 | 7163.00 |
| year | 2702460.00 | 2016.33 | 0.47 | 2016.00 | 2016.00 | 2016.00 | 2017.00 | 2017.00 |
listings.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 7403.00 | 6950001.48 | 4052396.78 | 3335.00 | 3864099.50 | 7035240.00 | 9500804.00 | 14933461.00 |
| scrape_id | 7403.00 | 20160492479532.68 | 400929617.51 | 20160104002432.00 | 20160104002432.00 | 20160104002432.00 | 20160906204935.00 | 20160906204935.00 |
| host_id | 7403.00 | 20210534.87 | 19623093.35 | 4193.00 | 4343163.50 | 14486505.00 | 30283594.00 | 93854106.00 |
| host_listings_count | 7401.00 | 32.22 | 123.59 | 0.00 | 1.00 | 1.00 | 4.00 | 749.00 |
| host_total_listings_count | 7401.00 | 32.22 | 123.59 | 0.00 | 1.00 | 1.00 | 4.00 | 749.00 |
| latitude | 7403.00 | 45.07 | 2.64 | 42.24 | 42.35 | 47.54 | 47.62 | 47.73 |
| longitude | 7403.00 | -97.52 | 25.61 | -122.42 | -122.33 | -122.28 | -71.08 | -71.00 |
| accommodates | 7403.00 | 3.20 | 1.89 | 1.00 | 2.00 | 2.00 | 4.00 | 16.00 |
| bathrooms | 7373.00 | 1.24 | 0.55 | 0.00 | 1.00 | 1.00 | 1.00 | 8.00 |
| bedrooms | 7387.00 | 1.28 | 0.82 | 0.00 | 1.00 | 1.00 | 2.00 | 7.00 |
| beds | 7393.00 | 1.67 | 1.08 | 0.00 | 1.00 | 1.00 | 2.00 | 16.00 |
| square_feet | 153.00 | 856.03 | 647.19 | 0.00 | 420.00 | 770.00 | 1200.00 | 3000.00 |
| guests_included | 7403.00 | 1.56 | 1.20 | 0.00 | 1.00 | 1.00 | 2.00 | 15.00 |
| minimum_nights | 7403.00 | 2.76 | 13.24 | 1.00 | 1.00 | 2.00 | 3.00 | 1000.00 |
| maximum_nights | 7403.00 | 14313.37 | 1162232.13 | 1.00 | 95.50 | 1125.00 | 1125.00 | 99999999.00 |
| availability_30 | 7403.00 | 12.85 | 12.07 | 0.00 | 0.00 | 10.00 | 26.00 | 30.00 |
| availability_60 | 7403.00 | 29.56 | 23.84 | 0.00 | 1.00 | 31.00 | 55.00 | 60.00 |
| availability_90 | 7403.00 | 48.63 | 35.01 | 0.00 | 7.00 | 56.00 | 84.00 | 90.00 |
| availability_365 | 7403.00 | 213.09 | 138.34 | 0.00 | 77.00 | 259.00 | 347.00 | 365.00 |
| number_of_reviews | 7403.00 | 20.68 | 36.73 | 0.00 | 1.00 | 7.00 | 23.00 | 474.00 |
| review_scores_rating | 5943.00 | 93.32 | 8.21 | 20.00 | 91.00 | 96.00 | 99.00 | 100.00 |
| review_scores_accuracy | 5922.00 | 9.54 | 0.82 | 2.00 | 9.00 | 10.00 | 10.00 | 10.00 |
| review_scores_cleanliness | 5932.00 | 9.42 | 1.00 | 2.00 | 9.00 | 10.00 | 10.00 | 10.00 |
| review_scores_checkin | 5925.00 | 9.72 | 0.68 | 2.00 | 10.00 | 10.00 | 10.00 | 10.00 |
| review_scores_communication | 5934.00 | 9.73 | 0.66 | 2.00 | 10.00 | 10.00 | 10.00 | 10.00 |
| review_scores_location | 5926.00 | 9.52 | 0.78 | 2.00 | 9.00 | 10.00 | 10.00 | 10.00 |
| review_scores_value | 5926.00 | 9.32 | 0.89 | 2.00 | 9.00 | 9.00 | 10.00 | 10.00 |
| license | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| calculated_host_listings_count | 7403.00 | 7.69 | 21.47 | 1.00 | 1.00 | 1.00 | 3.00 | 136.00 |
| reviews_per_month | 6020.00 | 2.03 | 1.97 | 0.01 | 0.59 | 1.37 | 2.89 | 19.15 |
| year | 7403.00 | 2016.00 | 0.00 | 2016.00 | 2016.00 | 2016.00 | 2016.00 | 2016.00 |
reviews.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| listing_id | 153124.00 | 3787517.47 | 3248161.22 | 3353.00 | 1071843.00 | 3139972.00 | 5958674.00 | 14843783.00 |
| id | 153124.00 | 40342403.39 | 24778966.56 | 1021.00 | 20620259.75 | 38463998.00 | 53978187.50 | 99990454.00 |
| reviewer_id | 153124.00 | 21922545.05 | 18816034.68 | 15.00 | 6342335.75 | 17552614.00 | 32977175.50 | 93350341.00 |
| year | 153124.00 | 2014.72 | 1.06 | 2009.00 | 2014.00 | 2015.00 | 2015.00 | 2016.00 |
listings.head(1).T
| 0 | |
|---|---|
| id | 12147973 |
| listing_url | https://www.airbnb.com/rooms/12147973 |
| scrape_id | 20160906204935 |
| last_scraped | 2016-09-07 00:00:00 |
| name | Sunny Bungalow in the City |
| summary | Cozy, sunny, family home. Master bedroom high ceilings. Deck, garden with hens, beehives & play structure. Short walk to charming village with attractive stores, groceries & local restaurants. Friendly neighborhood. Access public transportation. |
| space | The house has an open and cozy feel at the same time. The living room has a flat screen TV. The kitchen has all you need for cooking. We prefer you buy your food but can use the organic oils, herbs, etc. The yard can be seen from sitting room and when the weather allows, the yard is a place children can lose themselves in a safe way. We have 2 bee hives, 6 hens fenced in (sometimes they get out of their coop area & into the yard), 2 rabbits in a hutch and play structure. |
| description | Cozy, sunny, family home. Master bedroom high ceilings. Deck, garden with hens, beehives & play structure. Short walk to charming village with attractive stores, groceries & local restaurants. Friendly neighborhood. Access public transportation. The house has an open and cozy feel at the same time. The living room has a flat screen TV. The kitchen has all you need for cooking. We prefer you buy your food but can use the organic oils, herbs, etc. The yard can be seen from sitting room and when the weather allows, the yard is a place children can lose themselves in a safe way. We have 2 bee hives, 6 hens fenced in (sometimes they get out of their coop area & into the yard), 2 rabbits in a hutch and play structure. You will have access to 2 bedrooms, a living room, kitchen, bathrooms, and yard. Roslindale is quiet, convenient and friendly. For Southern food try Redd's in Rozzie. Italian Delfino's or Sophia's Grotto are great. Birch St Bistro has nice atmostphere--a little pric |
| experiences_offered | none |
| neighborhood_overview | Roslindale is quiet, convenient and friendly. For Southern food try Redd's in Rozzie. Italian Delfino's or Sophia's Grotto are great. Birch St Bistro has nice atmostphere--a little pricier. If you are cooking the Fish Market has fresh fish daily; Tony's makes his own sausages and has Italian foods; for a wide variety of delicious cheeses and chocolates go to the Cheese Cellar on Birch St. |
| notes | NaN |
| transit | The bus stop is 2 blocks away, and frequent. Bus is about a 10 minute ride to the Orange line, forest hills. The commuter rail into down town is a 6 minute walk from our home. Walking is pleasant and restaurants, shops and grocery store are all a 6 minute walk away. |
| access | You will have access to 2 bedrooms, a living room, kitchen, bathrooms, and yard. |
| interaction | NaN |
| house_rules | Clean up and treat the home the way you'd like your home to be treated. No smoking. |
| thumbnail_url | https://a2.muscache.com/im/pictures/c0842db1-ee98-4fe8-870b-d1e2af33855d.jpg?aki_policy=small |
| medium_url | https://a2.muscache.com/im/pictures/c0842db1-ee98-4fe8-870b-d1e2af33855d.jpg?aki_policy=medium |
| picture_url | https://a2.muscache.com/im/pictures/c0842db1-ee98-4fe8-870b-d1e2af33855d.jpg?aki_policy=large |
| xl_picture_url | https://a2.muscache.com/im/pictures/c0842db1-ee98-4fe8-870b-d1e2af33855d.jpg?aki_policy=x_large |
| host_id | 31303940 |
| host_url | https://www.airbnb.com/users/show/31303940 |
| host_name | Virginia |
| host_since | 2015-04-15 |
| host_location | Boston, Massachusetts, United States |
| host_about | We are country and city connecting in our deck and garden. Enjoy our music room, books and flat screen TV with the pastoral backyard of hens, bees, rabbits and an organic garden. |
| host_response_time | NaN |
| host_response_rate | NaN |
| host_acceptance_rate | NaN |
| host_is_superhost | f |
| host_thumbnail_url | https://a2.muscache.com/im/pictures/5936fef0-ba16-45bd-ac33-9226137d0763.jpg?aki_policy=profile_small |
| host_picture_url | https://a2.muscache.com/im/pictures/5936fef0-ba16-45bd-ac33-9226137d0763.jpg?aki_policy=profile_x_medium |
| host_neighbourhood | Roslindale |
| host_listings_count | 1.0 |
| host_total_listings_count | 1.0 |
| host_verifications | ['email', 'phone', 'facebook', 'reviews'] |
| host_has_profile_pic | t |
| host_identity_verified | f |
| street | Birch Street, Boston, MA 02131, United States |
| neighbourhood | Roslindale |
| neighbourhood_cleansed | Roslindale |
| neighbourhood_group_cleansed | NaN |
| city | Boston |
| state | MA |
| zipcode | 02131 |
| market | Boston |
| smart_location | Boston, MA |
| country_code | US |
| country | United States |
| latitude | 42.28 |
| longitude | -71.13 |
| is_location_exact | t |
| property_type | House |
| room_type | Entire home/apt |
| accommodates | 4 |
| bathrooms | 1.5 |
| bedrooms | 2.0 |
| beds | 3.0 |
| bed_type | Real Bed |
| amenities | {TV,"Wireless Internet",Kitchen,"Free Parking on Premises","Pets live on this property",Dog(s),Heating,"Family/Kid Friendly",Washer,Dryer,"Smoke Detector","Fire Extinguisher",Essentials,Shampoo,"Laptop Friendly Workspace"} |
| square_feet | NaN |
| price | $250.00 |
| weekly_price | NaN |
| monthly_price | NaN |
| security_deposit | NaN |
| cleaning_fee | $35.00 |
| guests_included | 1 |
| extra_people | $0.00 |
| minimum_nights | 2 |
| maximum_nights | 1125 |
| calendar_updated | 2 weeks ago |
| has_availability | NaN |
| availability_30 | 0 |
| availability_60 | 0 |
| availability_90 | 0 |
| availability_365 | 0 |
| calendar_last_scraped | 2016-09-06 |
| number_of_reviews | 0 |
| first_review | NaN |
| last_review | NaN |
| review_scores_rating | NaN |
| review_scores_accuracy | NaN |
| review_scores_cleanliness | NaN |
| review_scores_checkin | NaN |
| review_scores_communication | NaN |
| review_scores_location | NaN |
| review_scores_value | NaN |
| requires_license | f |
| license | NaN |
| jurisdiction_names | NaN |
| instant_bookable | f |
| cancellation_policy | moderate |
| require_guest_profile_picture | f |
| require_guest_phone_verification | f |
| calculated_host_listings_count | 1 |
| reviews_per_month | NaN |
| year | 2016 |
| month | September |
| day | Wednesday |
# # create a float from price data
# if isinstance(listings["price"][0],str):
# listings["price"]=listings.price.str.extract(r'(\d+\.\d+)').astype(float)
# nlistings = listings.join(listings.groupby("zipcode").id.transform('size'),rsuffix="_d")
# nlistings = nlistings.rename({"id_d":"zip_count"},axis=1)
# zipOne = nlistings.groupby("zipcode").first().reset_index()
# zipOne['zip_count'] = zipOne.zip_count.astype(int)
def avgBy(df, var, varname, groupby='zipcode'):
    '''
    avgBy - group df by `groupby`, compute the mean of `var` per group,
    and keep one complete row of the dataframe for each group.
    The new column `varname` holds sqrt(mean)*10 as an int, a rescaling
    intended for use as a marker size on the map plots.
    NOTE: if `var` holds strings, the column is converted to float
    in place on the caller's dataframe.
    '''
    first_value = df[var][0]
    if isinstance(first_value, str):
        # parse a decimal number (e.g. "$250.00") out of the string column
        df[var] = df[var].str.extract(r'(\d+\.\d+)').astype(float)
    valid = df.dropna(subset=[var])
    # broadcast each group's mean back onto every row, then rename the joined column
    group_means = valid.groupby(groupby)[var].transform('mean')
    joined = valid.join(group_means, rsuffix="_d").rename({f"{var}_d": varname}, axis=1)
    # one representative (first) row per group
    one_per_group = joined.groupby(groupby).first().reset_index()
    one_per_group[varname] = one_per_group[varname].apply(lambda m: np.sqrt(m) * 10).astype(int)
    return one_per_group
def countBy(df, var, varname, groupby='zipcode'):
    '''
    countBy - group df by `groupby`, count the non-null rows of `var` per group,
    and keep one complete row of the dataframe for each group.
    (Docstring previously copied from avgBy and claimed an average.)
    The new column `varname` holds sqrt(count)*10 as an int, a rescaling
    intended for use as a marker size on the map plots.
    NOTE: if `var` holds strings, the column is converted to float
    in place on the caller's dataframe.
    '''
    if isinstance(df[var][0], str):
        # parse a decimal number (e.g. "$250.00") out of the string column
        df[var] = df[var].str.extract(r'(\d+\.\d+)').astype(float)
    df = df.dropna(subset=[var])
    # broadcast each group's size back onto every row, then rename the joined column
    plistings = df.join(df.groupby(groupby)[var].transform('size'), rsuffix="_d")
    plistings = plistings.rename({f"{var}_d": varname}, axis=1)
    # one representative (first) row per group
    pOne = plistings.groupby(groupby).first().reset_index()
    pOne[varname] = pOne[varname].apply(lambda x: np.sqrt(x) * 10).astype(int)
    return pOne
# per-zipcode summary frames used for the map overlays below
avgRatings = avgBy(listings,'review_scores_rating','Average Ratings',groupby='zipcode')
avgPrice = avgBy(listings,'price','Average Price',groupby='zipcode')  # NOTE: avgBy converts listings.price from "$..." strings to float in place
countZip = countBy(listings,'id','Count',groupby='zipcode')
# get the coordinates of a bounding box
def coords(grp):
    '''Return the bounding box (lon_min, lon_max, lat_min, lat_max) of grp.'''
    lon, lat = grp.longitude, grp.latitude
    return (lon.min(), lon.max(), lat.min(), lat.max())
def mapRegions(df, s=10, labels=False, annot=""):
    '''
    mapRegions - plots a map of Boston and Seattle, with observations for lat/lon
                 which depend on dataframe used
    inputs - df     - a dataframe containing lat/lon information and a 'city' column
                      (optionally a third variable used for point sizes)
             s      - size of points: either a fixed int, or the name of a df column
                      whose per-row values set individual point sizes
             labels - if True, annotate each point with its value of column s
             annot  - prefix string for the annotations (e.g. "$" for prices)
    '''
    # to overwrite s type check later
    # switch=True means s names a column rather than being a fixed marker size
    switch=False
    if not isinstance(s,int):
        switch=True
    # load maps
    # NOTE(review): these two preloaded images appear unused — the loop below
    # re-reads '{city}_map.png' per city; confirm whether the *_zip_map files are needed
    boston_map = plt.imread('boston_zip_map.png')
    seattle_map = plt.imread('seattle_zip_map.png')
    # bounding box helper (shadows the module-level coords on purpose)
    def coords(grp):
        return (grp.longitude.min(), grp.longitude.max(),
                grp.latitude.min(), grp.latitude.max())
    # create fig with subplots — one axis per city, side by side
    fig, ax = plt.subplots(1,2, figsize = (14,10))
    # unique cities
    cities = df.city.unique()
    # loop over cities
    for i, city in enumerate(cities):
        # Map image for this city, read from the working directory
        mapC = plt.imread(f'{city.lower()}_map.png')
        # city dataframe
        dfC = df.loc[df.city==city]
        # Bounding box (lon_min, lon_max, lat_min, lat_max) for this city's points
        BBox = dfC.groupby('city').apply(coords)
        BBoxC = BBox.loc[city]
        # check if the size is a fixed number or variable
        if switch:
            sC = dfC[s]
            sT=s
            # 5th percentile — threshold below which labels are drawn in red
            sc5 = dfC[s].quantile(0.05)
        else:
            sC=s
            sT="Spatial Location"
            sc5 = False
        # plot points, with size relative to the value of the s variable
        ax[i].scatter(dfC.longitude, dfC.latitude, zorder=1, alpha= 0.6, c='b', s=sC)
        if labels:
            for ind in range(dfC.shape[0]):
                # set text color: bottom-5% values highlighted in red
                if sc5 and dfC.iloc[ind][s] < sc5:
                    tColor='red'
                else:
                    tColor='black'
                # print(dfB['longitude'][ind],dfB['latitude'][ind])
                # Overall averages
                ax[i].annotate(f"{annot}{dfC.iloc[ind][s]}",
                               (dfC.iloc[ind]['longitude'],dfC.iloc[ind]['latitude']),
                               size=12, ha='left',
                               va='top', weight = "normal",
                               color = tColor,
                               # xytext=(1, 1), textcoords='axes fraction',
                               bbox=dict(facecolor = "white", edgecolor="grey", boxstyle='round,pad=0.05')
                               )
        ax[i].set_title(f'Plotting {sT} Data on {city} Map',y=1.02)
        # clip the axes to the data's bounding box so the map image lines up
        ax[i].set_xlim(BBoxC[0],BBoxC[1])
        ax[i].set_ylim(BBoxC[2],BBoxC[3])
        # draw the map under the scatter (zorder=0), stretched to the same extent
        ax[i].imshow(mapC, zorder=0, extent = BBoxC, aspect= 'auto');
    plt.tight_layout();
mapRegions(avgPrice,s='Average Price',labels=True, annot="$")
mapRegions(avgRatings,s='Average Ratings',labels=True, annot="")
2**16
65536
mapRegions(listings, s=5)
mapRegions(countZip,s='Count', labels=True)
Boston: (-71.17178882136899, -71.00009991969033, 42.23594180770681, 42.38998167884297) Seattle: (-122.4172188142562, -122.2406070305399, 47.50508834369015, 47.73335790856482)
"ELOISA".title()
'Eloisa'
!jupyter nbconvert p1_blog_airBnB.ipynb --to html
[NbConvertApp] Converting notebook p1_blog_airBnB.ipynb to html [NbConvertApp] Writing 7879250 bytes to p1_blog_airBnB.html